Preprocessing Dataset

Libraries

Library for read dataset.

library(readr)

Library for data frames processing.

library(dplyr)
library(tidyr)

Library for R Markdown.

library(rmarkdown)
library(knitr)

Library for data presentation.

library(scales)

Library for manage strings.

library(stringr)

Load dataset

Set the path of the dataframe file.

path = "../virusTotal/data/virusTotal.csv"

Load dataset.

df <- read_csv(path)

Statistics

Dimensions.

dim(df)
## [1] 183 447

Types

View witch types are in the dataset.

col_types_all <- 
  df %>% 
  sapply(typeof) %>% 
  unlist()

col_types_table <- 
  col_types_all %>% 
  table()

col_types <- 
  col_types_table %>% 
  as.vector()

names(col_types) <- names(col_types_table)
## character    double   logical 
##       158       204        85

As can be seen there are the three expected types: character, double and logical.

NA

Percentaje of NA values

Define function to see the amount of NA values in the dataframe.

percent_of_NA <- 
  function(df){
    num_of_NA <- 
      df %>% is.na() %>% sum()
    num_of_values <- 
      df %>% dim() %>% prod()
    percent_of_NA <- 
      (num_of_NA / num_of_values) %>% 
      percent()
    return(percent_of_NA)
  }
percent_of_NA(df)
## [1] "37%"

Columns with NA

Define functions to see the NA in columns.

num_of_NA_by_column <- 
  function(df){
    df %>% is.na() %>% colSums()
  }
remove_0 <- 
  function(x) x[x!=0]
names_of_colums_with_NA <- 
  function(df)
    df %>% 
      num_of_NA_by_column() %>% 
      remove_0 %>% 
      names()
percentaje_of_cols_with_NA <-
  function(df)
    (length(names_of_colums_with_NA(df)) / ncol(df)) %>% 
    percent()

Compute the percentaje of cols with NA.

percentaje_of_cols_with_NA(df)
## [1] "74%"

Inspect if there are columns full of NA.

is_full_of_NA <- function(col){
  num_of_NA <- 
    col %>% 
    is.na() %>% 
    sum()
  return(num_of_NA == length(col))
}
cols_full_of_NA <- 
  df %>% 
  select_if(is_full_of_NA) %>% 
  names()
##  [1] "authentihash"                  "scans.Bkav.result"            
##  [3] "scans.CMC.result"              "scans.ALYac.result"           
##  [5] "scans.Malwarebytes.result"     "scans.K7AntiVirus.result"     
##  [7] "scans.Baidu.result"            "scans.SUPERAntiSpyware.result"
##  [9] "scans.Gridinsoft.result"       "scans.ViRobot.result"         
## [11] "scans.BitDefenderTheta.result" "scans.TACHYON.result"         
## [13] "scans.VBA32.result"            "scans.Zoner.result"           
## [15] "scans.Panda.result"            "scans.Elastic.result"         
## [17] "scans.Cylance.result"          "scans.SentinelOne.result"

As can be seen there are many columns that are full of NA, so can be deleted.

df <- 
  select(df, -all_of(cols_full_of_NA))

Colums with the same value

Maybe there are columns that has the same value along all the vector, so are useless.

Define function to remove these columns.

different_values <- 
  function(x)
    x %>% na.omit() %>% unique() %>% length()
remove_columns_with_the_same_value <- 
  function(df)
      select_if(df, function(col) different_values(col) > 1)

Apply function.

num_of_cols_after_remove <- 
  df %>% 
  remove_columns_with_the_same_value() %>% 
  ncol()

Calculate the number of columns with same value.

ncol(df) - num_of_cols_after_remove
## [1] 147

Awesom! Many colums found. Let’s remove them.

df <- 
  remove_columns_with_the_same_value(df)

Inspecting dataframe

Now let’s deeply inspect into the dataframe.

View dataframe

View dataframe.

Renaming

The column “…1” is the row number, so “n” will be a better name. The “…JSON” it’s a bad name, just “json” is fine.

df <- 
  df %>% 
  rename(n = ...1, json=..JSON)

Removing cols

There are many duplicated cols, hashes & dates that can be removed, also many useless.

Dates

There are many dates in the dataset, that are not relevant for virus analysis. So let’s remove them.

Define a function for check if a col is of type Date.

not <- 
  function(x) !x

get_element <- 
  function(x, index) x[index]

is_date_col <- 
  function(col, pattern="^[:digit:]{4}[-:/][:digit:]{2}[-:/][:digit:]{2}")
    col %>% 
    as.character() %>% 
    na.omit() %>% 
    get_element(1) %>% 
    str_detect(pattern)

Columns detected.

df %>% 
  select_if(is_date_col) %>% 
  head() %>% 
  paged_table()

Define function for remove cols by a predicate.

remove_col_if <- 
  function(df, fun){
    cols_to_delete <- 
      df %>% 
      select_if(fun) %>% 
      colnames()
    df <- 
      df %>% 
      select(-cols_to_delete)
    return(df)
  }

Remove them.

df <- 
  remove_col_if(df, is_date_col)
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(cols_to_delete)` instead of `cols_to_delete` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

Hashes

There are many hashes cols that don’t really provide useful information. So remove them.

hashes <- 
  c("")

df <- 
  df %>% 
  select(
    -vhash,
    -sha256,
    -sha1,
    -scan_id,
    -ssdeep,
    -md5,
    -additional_info.androguard.certificate.serialnumber,
    -additional_info.androguard.certificate.thumbprint,
    -additional_info.exiftool.ZipCRC
  )

Scans

There are many scans of different antivirus, that has very similar information. Just keep the scan with less NA values.

Get the best col.

scans_col_names <- 
  df %>% 
  colnames() %>% 
  str_match_all("scans.[:alpha:]*.result") %>% 
  unlist()

scan_na_by_col <- 
  df %>% 
  select(all_of(scans_col_names)) %>%
  num_of_NA_by_column()

scan_col_witch_min_na <- 
  scan_na_by_col %>% 
  which.min()

best_scanner_colname <- 
  scan_na_by_col %>% 
  names() %>% 
  get_element(scan_col_witch_min_na)

best_scanner_col <- 
  df %>% 
  select(all_of(best_scanner_colname))

best_scanner_name <- 
  best_scanner_colname %>% 
  str_split("[.]") %>% 
  unlist() %>% 
  get_element(2)
## [1] "Best scan col: scans.Fortinet.result"
## [1] "Best scanner: Fortinet"

Drop all scans but the best.

col_index_scanners <- 
  df %>% 
  colnames() %>% 
  str_detect("scans") %>%
  unlist() %>% 
  which()

df <- 
  df %>% 
  select(-all_of(col_index_scanners)) %>% 
  cbind(best_scanner_col)

Individual columns

Reasons:

  • json column contains all the row as JSON.
  • permalink is the URL where Virus Total has the virus file.
  • Main.Activity & Package are strings with all different values.
  • FileTypeExtension, ZipFileName & MIMEType has same values as FileType.
  • ZipBitFlag doesn’t seems to matter.
  • additional_info.magic has the vesion of the ZIP file, that doesn’t seems to matter.
  • Subject.DN is the JSON fragment that has all the information about the subject, but these data are decomposed in the rest of Subject cols.
df <- 
  df %>% 
  select(
    -json,
    -permalink,
    -additional_info.androguard.AndroidApplicationInfo,
    -additional_info.androguard.Main.Activity,
    -additional_info.exiftool.MIMEType,
    -additional_info.exiftool.FileTypeExtension,
    -additional_info.exiftool.ZipFileName,
    -additional_info.magic,
    -additional_info.androguard.Package,
    -additional_info.androguard.certificate.Subject.DN
  )

Groups of columns

Define a function to remove cols which name match a pattern.

remove_cols_which_name_match <- 
  function(df, pattern){
    cols_to_remove <- 
      df %>% 
      colnames() %>% 
      str_which(pattern)
    df_removed_cols <- 
      df %>% 
      select(-all_of(cols_to_remove))
    return(df_removed_cols)
  }

Remove groups.

Reasons:

  • Issuer group has the same information as Subject group.
  • CompressedView and RiskIndicator.APK groups have the same information as file_type group
df <- 
  df %>% 
  remove_cols_which_name_match("^additional_info.androguard.certificate.Issuer.[:alpha:]*$") %>% 
  remove_cols_which_name_match("^additional_info.compressedview.extensions.[:alpha:]*$") %>% 
  remove_cols_which_name_match("^additional_info.androguard.RiskIndicator.APK.[:alpha:]*$")

View results

Define a function for sort columns.

sort_cols <- 
  function(df){
    df <- 
      df %>% 
      select(order(colnames(df)))
    additionalInfo_cols_logical <- 
      df %>% 
      colnames() %>% 
      str_detect("additional_info")
    additionalInfo_cols <- 
      df %>% 
      select(which(additionalInfo_cols_logical))
    not_additionalInfo_cols <- 
      df %>% 
      select(which(!additionalInfo_cols_logical)) %>% 
      select(n, size, everything())
    return(cbind(not_additionalInfo_cols, additionalInfo_cols))
  }

Sort columns.

df <- 
  sort_cols(df)

View results.

Replacing values

Replace “Unknown” and “?” by NA

There are some columns that hast the value “Unknown” or “?” instead of NAs. So let’s replace them.

Define a function to replace values in cols that satisfy a predicate.

replace_when <- 
  function(df, fun, value, replacement){
    cols_to_replace <- 
      df %>% 
      select_if(fun) %>% 
      colnames()
    df_replaced_cols <- 
      df %>% 
      select(all_of(cols_to_replace)) %>% 
      sapply(function(col) replace(col, which(col==value), replacement))
    df_without_replaced_cols <- 
      df %>% 
      select(-all_of(cols_to_replace))
    return(cbind(df_without_replaced_cols, df_replaced_cols))
  }

Replace ? and Unknown for NAs.

df <- 
  df %>% 
  replace_when(function(col) any(str_detect(col, fixed("Unknown"))), "Unknown", NA) %>% 
  replace_when(function(col) any(str_detect(col, fixed("?"))), "?", NA)

Replace NA for 0

Permissions colums

The permissions columns (PERM) seems that there are NAs where there should be 0s. So it would be better to replace them.

replace_na_which_colname_match <- 
  function(df, pattern, replacement){
    cols_to_replace <- 
      df %>% 
      colnames() %>% 
      str_which(pattern)
    df_replaced_cols <- 
      df %>% 
      select(all_of(cols_to_replace)) %>% 
      sapply(function(col) replace_na(col, replacement))
    df_without_replaced_cols <- 
      df %>% 
      select(-all_of(cols_to_replace))
    return(cbind(df_without_replaced_cols, df_replaced_cols))
  }
replace_na_when <- 
  function(df, fun, replacement){
    cols_to_replace <- 
      df %>% 
      select_if(fun) %>% 
      colnames()
    df_replaced_cols <- 
      df %>% 
      select(all_of(cols_to_replace)) %>% 
      sapply(function(col) replace_na(col, replacement))
    df_without_replaced_cols <- 
      df %>% 
      select(-all_of(cols_to_replace))
    return(cbind(df_without_replaced_cols, df_replaced_cols))
  }
pattern <- "additional_info.androguard.RiskIndicator.PERM"

df <- 
  df %>% 
  replace_na_which_colname_match(pattern, 0)

df %>% 
  select(., str_which(colnames(.), pattern)) %>% 
  paged_table()

New total permissions column

pattern <- "additional_info.androguard.RiskIndicator.PERM"

df_without_permissions <- 
  df %>% 
  select(., -(str_which(colnames(.), pattern)))

df_permissions <- 
  df %>% 
  select(., str_which(colnames(.), pattern)) %>% 
  mutate(., total_PERMs = rowSums(.))

df <- cbind(df_without_permissions, df_permissions)

View results

Sort columns.

df <- 
  sort_cols(df)

View results.

Save dataframe

After all preprocessing let’s save it into CSV.

write.csv(df, path)

Functions for preprocessing

As factor

labels <- 
  function(n){
    if(n == 5){
      return(c("very low", "low", "medium", "high", "very high"))
    }else if(n == 4){
      return(c("very low", "low", "high", "very high"))
    }else if(n == 3){
      return(c("low", "medium", "high"))
    }else if(n == 2){
      return(c("low", "high"))
    }else{
      stop("Not avalible")
    }
  }

cut_by_quantiles <- 
  function(col){
    quantiles <- 
      col %>% 
      quantile(na.rm = TRUE) %>% 
      unique()
    if(length(quantiles) > 2){
      col <- 
        col %>% 
        cut(breaks = quantiles, 
            labels = labels(length(quantiles)-1),
            include.lowest = TRUE)
    }
    return(col)
  }

df_cut_by_quantiles <- 
  function(df){
    df_without_numeric <- 
      df[sapply(df, function(col) !is.numeric(col))]
    df_numeric <- 
      df %>% 
      select_if(is.numeric)
    df_numeric <- 
      df_numeric %>% 
      lapply(cut_by_quantiles)
    return(cbind(df_without_numeric, df_numeric))
  }